In [89]:
import numpy as np
import pandas as pd
import seaborn as sns; sns.set()
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
import plotly 
import plotly.figure_factory as ff
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler

%matplotlib notebook
In [2]:
df = pd.read_excel("./dataset.xlsx")
df.head()
Out[2]:
status card_present_flag bpay_biller_code account currency long_lat txn_description merchant_id merchant_code first_name ... age merchant_suburb merchant_state extraction amount transaction_id country customer_id merchant_long_lat movement
0 authorized 1.0 NaN ACC-1598451071 AUD 153.41 -27.95 POS 81c48296-73be-44a7-befa-d053f48ce7cd NaN Diana ... 26 Ashmore QLD 2018-08-01T01:01:15.000+0000 16.25 a623070bfead4541a6b0fff8a09e706c Australia CUS-2487424745 153.38 -27.99 debit
1 authorized 0.0 NaN ACC-1598451071 AUD 153.41 -27.95 SALES-POS 830a451c-316e-4a6a-bf25-e37caedca49e NaN Diana ... 26 Sydney NSW 2018-08-01T01:13:45.000+0000 14.19 13270a2a902145da9db4c951e04b51b9 Australia CUS-2487424745 151.21 -33.87 debit
2 authorized 1.0 NaN ACC-1222300524 AUD 151.23 -33.94 POS 835c231d-8cdf-4e96-859d-e9d571760cf0 NaN Michael ... 38 Sydney NSW 2018-08-01T01:26:15.000+0000 6.42 feb79e7ecd7048a5a36ec889d1a94270 Australia CUS-2142601169 151.21 -33.87 debit
3 authorized 1.0 NaN ACC-1037050564 AUD 153.10 -27.66 SALES-POS 48514682-c78a-4a88-b0da-2d6302e64673 NaN Rhonda ... 40 Buderim QLD 2018-08-01T01:38:45.000+0000 40.90 2698170da3704fd981b15e64a006079e Australia CUS-1614226872 153.05 -26.68 debit
4 authorized 1.0 NaN ACC-1598451071 AUD 153.41 -27.95 SALES-POS b4e02c10-0852-4273-b8fd-7b3395e32eb0 NaN Diana ... 26 Mermaid Beach QLD 2018-08-01T01:51:15.000+0000 3.25 329adf79878c4cf0aeb4188b4691c266 Australia CUS-2487424745 153.44 -28.06 debit

5 rows × 23 columns

In [3]:
df.describe()
Out[3]:
card_present_flag merchant_code balance age amount
count 7717.000000 883.0 12043.000000 12043.000000 12043.000000
mean 0.802644 0.0 14704.195553 30.582330 187.933588
std 0.398029 0.0 31503.722652 10.046343 592.599934
min 0.000000 0.0 0.240000 18.000000 0.100000
25% 1.000000 0.0 3158.585000 22.000000 16.000000
50% 1.000000 0.0 6432.010000 28.000000 29.000000
75% 1.000000 0.0 12465.945000 38.000000 53.655000
max 1.000000 0.0 267128.520000 78.000000 8835.980000
In [4]:
# Inital exploration into target customers regarding age 
fig = px.box(df, y="age")
fig.show()
In [5]:
# Analyzingoutliers in amount
df2 = df[df["amount"]<200]
df2.sort_values(by="amount",axis=0, ascending=False)
Out[5]:
status card_present_flag bpay_biller_code account currency long_lat txn_description merchant_id merchant_code first_name ... age merchant_suburb merchant_state extraction amount transaction_id country customer_id merchant_long_lat movement
1273 posted NaN NaN ACC-1973887809 AUD 115.78 -31.90 PAYMENT NaN NaN Antonio ... 26 NaN NaN 2018-08-10T18:00:00.000+0000 199.0 d5958dbb0754478ebf98e7d9b26bc195 Australia CUS-1646621553 NaN debit
5226 posted NaN NaN ACC-1973887809 AUD 115.78 -31.90 PAYMENT NaN NaN Antonio ... 26 NaN NaN 2018-09-10T18:00:00.000+0000 199.0 e6f7e7d915064d5bb657eecdd1e9ddf0 Australia CUS-1646621553 NaN debit
9227 posted NaN NaN ACC-1973887809 AUD 115.78 -31.90 PAYMENT NaN NaN Antonio ... 26 NaN NaN 2018-10-10T18:00:00.000+0000 199.0 0776ec5925cc4b53a657db4ddaafca33 Australia CUS-1646621553 NaN debit
2020 authorized 1.0 NaN ACC-3771436525 AUD 145.04 -37.92 SALES-POS c51ef0f5-d6ef-4b80-8217-cd059aee74e1 NaN Derek ... 20 Sydney NSW 2018-08-17T15:32:57.000+0000 198.5 490d71dace6a421bae8bfa6203070553 Australia CUS-3325710106 151.2 -33.87 debit
3329 posted NaN NaN ACC-4059612845 AUD 130.98 -12.49 PAYMENT NaN NaN Jonathan ... 38 NaN NaN 2018-08-27T13:00:00.000+0000 198.0 abf0e03d25624f818b2bad0035ef3e77 Australia CUS-495599312 NaN debit
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
6592 authorized 1.0 NaN ACC-2265520058 AUD 151.27 -33.85 POS 3e900109-863e-431a-8287-2ff69fe3b8f2 NaN Catherine ... 26 Woollahra NSW 2018-09-20T21:49:05.000+0000 0.1 dd9a6c8d1c454705a62e7a67592698be Australia CUS-331942311 151.25 -33.89 debit
6428 authorized 1.0 NaN ACC-3541460373 AUD 145.00 -37.83 POS a6a341db-200c-4833-a93d-893d18b019ea NaN Jeffrey ... 24 Southbank VIC 2018-09-19T17:21:28.000+0000 0.1 392deeb7fff748f48b19ad95381e8aa2 Australia CUS-1433879684 144.96 -37.82 debit
5395 authorized 0.0 NaN ACC-211792489 AUD 138.63 -34.93 SALES-POS 3b6372a5-9b14-4d98-bb58-92c215bca597 NaN Jacqueline ... 30 Melbourne VIC 2018-09-12T09:02:37.000+0000 0.1 daab7b7e1e5c4503b87d96358d9b4bee Australia CUS-497688347 144.96 -37.81 debit
8188 authorized 1.0 NaN ACC-1598451071 AUD 153.41 -27.95 SALES-POS 03b8bb86-abbd-4769-87e5-a78a0c13b95e NaN Diana ... 26 Broadbeach QLD 2018-10-03T03:19:59.000+0000 0.1 4f3a5aff070e43fcaeebd0ae0cd24caa Australia CUS-2487424745 153.43 -28.03 debit
11523 authorized 0.0 NaN ACC-3771436525 AUD 145.04 -37.92 SALES-POS bc513677-92d4-4ed4-b8d1-9a35a806e335 NaN Derek ... 20 Malvern VIC 2018-10-27T18:53:02.000+0000 0.1 cce50d62571a417f8df6f74e9e11072c Australia CUS-3325710106 145.04 -37.85 debit

10936 rows × 23 columns

In [6]:
# Intial exploration into transaction salesin different states
# There were outliers in some amounts
df = df[df["amount"] < 200]
m_c = df["merchant_state"].dropna().unique()

bar_df = df.groupby("merchant_state").agg({"amount":np.sum})

fig= px.bar(bar_df, x=bar_df.index, y='amount',  color=m_c, labels={"amount":"Total Sales in AUD $", "merchant_state":"Merchant States"})
fig.show()
In [7]:
# Explortions on sales for individual days
df_date = df.groupby("date").agg({"amount":np.sum}).reset_index()

fig = px.line(df_date, x="date", y="amount", labels={"amount":"Total Sales in AUD $"})
fig.show()
In [8]:
# Explortions on sales for individual months

# Convert dates to months
month_dict = {8:"August", 9:"September",10:"October"}
df["month"]= df["date"].dt.month
df["month"] = df["month"].apply(lambda x : month_dict.get(x))

df_month = df.groupby("month").agg({"amount":np.sum}).reset_index()

fig= px.histogram(df_month, x="month", y="amount", color ="month", template="plotly_dark",
                 labels={"amount": "Total Sales in AUD $"})
fig.show()
In [9]:
# Constructing a side by side comparisons with total sales

fig = make_subplots(
    rows=1, cols=2,
    column_widths=[0.6, 0.4],
    row_heights=[0.4],
    specs=[[{"type": "bar"}, {"type": "bar"}]]
)

fig.add_trace(
    go.Bar(x=df_date["date"], y=df_date["amount"] ,showlegend=False),
    row=1, col=1
)

fig.add_trace(
    go.Bar(x=df_month["month"], y=df_month["amount"] ,showlegend=False,marker={"color" :"seagreen"}),
    row=1, col=2
)

fig.update_layout(title_text="Side by Side Comaprisons of Total Sales: Days vs Months")

fig.show()
In [67]:
# Explortions on avergae sales for individual days
df_date_av = df.groupby("date").agg({"amount":np.mean}).reset_index()

fig = px.line(df_date_av, x="date", y="amount", labels={"amount":"Average Sales in AUD $"})
fig.show()
In [68]:
df_month_av = df.groupby("month").agg({"amount":np.mean}).reset_index()

fig= px.histogram(df_month_av, x="month", y="amount", color ="month", template="plotly_dark",
                 labels={"amount": "Average Sales in AUD $"})
fig.show()
In [69]:
# Constructing a side by side comparisons with total sales

fig = make_subplots(
    rows=1, cols=2,
    column_widths=[0.6, 0.4],
    row_heights=[0.4],
    specs=[[{"type": "bar"}, {"type": "bar"}]]
)

fig.add_trace(
    go.Bar(x=df_date_av["date"], y=df_date_av["amount"] ,showlegend=False),
    row=1, col=1
)

fig.add_trace(
    go.Bar(x=df_month_av["month"], y=df_month_av["amount"] ,showlegend=False,marker={"color" :"seagreen"}),
    row=1, col=2
)

fig.update_layout(title_text="Side by Side Comaprisons of Average Sales: Days vs Months")

fig.show()
In [70]:
# Needed to scale values to compare average and total sales per day.
scaler = MinMaxScaler()

# Scaled df_date
df_date_scaled = pd.DataFrame(scaler.fit_transform(df_date["amount"].values.reshape(-1, 1)), columns=["amount"])
# Scaled df_date_av
df_date_av_scaled = pd.DataFrame(scaler.fit_transform(df_date_av["amount"].values.reshape(-1, 1)), columns=["amount"])
In [71]:
# Constructing a side by side comparisons with total sales and average sales per day
# Utilize scaled valuesfor comparisons

fig = make_subplots(
    specs=[[{"secondary_y": True}]]
)

fig.add_trace(
    go.Bar(name="Total Sales",x=df_date["date"], y=df_date_scaled["amount"] ,marker={"color" :"green"}),
    row=1, col=1
)

fig.add_trace(
    go.Scatter(name="Average Sales", x=df_date_av["date"], y=df_date_av_scaled["amount"] ,marker={"color" :"darkslateblue"}),
    row=1, col=1
)

fig.update_layout(title_text="Variaition of Average Sales and Total Sales each Day", yaxis_title="Scaled Values of Amount in AUD $")
fig.update_yaxes(showticklabels=False)
fig.show()
In [63]:
# Analysis of total sales by tax description
df_tax_d = df.groupby("txn_description").agg({"amount":np.sum})

fig = px.pie(df_tax_d, values='amount', names=df_tax_d.index, title='Sales by Tax Description', 
             color_discrete_sequence=px.colors.sequential.RdBu)
fig.show()
In [103]:
# Analyzing sales by total days

df_days=df.copy()
df_days["day"] = df["date"].dt.day
df_day_g = df_days.groupby("day").agg({"amount":np.sum})

fig= px.line(df_day_g, x=df_day_g.index, y="amount", labels={"amount": "Total Sales for a specific day in AUD $"})
fig.show()
In [131]:
# Analyzind data by laitude and longitude as a 3d visualization

df_lat_lon = pd.DataFrame(df["long_lat"].str.split(' ').tolist(),
                                 columns = ['lon','lat'])

df_geol = df.copy()
df_geol["lon"] = df_lat_lon["lon"]
df_geol["lat"] = df_lat_lon["lat"]

df_geol=df_geol[["lat","lon","amount"]]

fig = go.Figure(data=[go.Surface(z=df_geol)])
fig.update_traces(contours_z=dict(show=True, usecolormap=True,
                                  highlightcolor="limegreen", project_z=True))
fig.show()
In [139]:
# Analyzind data by laitude and longitude
df_lat_lon = pd.DataFrame(df["long_lat"].str.split(' ').tolist(),
                                 columns = ['lon','lat'])

df_geol = df.copy()

df_g_g = df_geol.groupby(["long_lat"]).agg({"amount":np.sum})

fig = px.bar(df_g_g, x=df_g_g.index, y="amount", color="amount")
fig.update_layout(title_text="Total Sales by coordinates")
fig.update_yaxes(title="Total Amount in AUD $")
fig.update_xaxes(title="Latitude and Longitude")

fig.show()

Summary After Data Explolaration

Interesting findings

  • The merchant NSW has the most sales.
  • The key demographics tagerted is 20 to 40 years of age.
  • The highest income of sales is generated when the averge sales are low for the day, meaning that discounts help sales.
  • The most sales occur on the 17th day of the month.
In [ ]: